#include <blitz/timer.h>

using namespace blitz;

void report(const char* name, Timer& timer, int N,
    long int iterations);
void unfused(int N);
void fused(int N);
void chunky(int N);

float* _bz_restrict a;
float* _bz_restrict b;
float* _bz_restrict c;
float* _bz_restrict d;
float* _bz_restrict e;
float* _bz_restrict f;

/*
 * Code to support the "fuse" macro
 */

int _chunk;
bool _done_chunks;
int _chunk_size = 512;

#define  fuse    _chunk = 0; _done_chunks = false;    \
                 for (; !_done_chunks; ++_chunk)

int main(int argc, char** argv)
{
    if (argc == 2)
        _chunk_size = atoi(argv[1]);

    cout << "Using chunk size " << _chunk_size << endl;

    const int N = 100000;

    a = new float[N];
    b = new float[N];
    c = new float[N];
    d = new float[N];
    e = new float[N];
    f = new float[N];

    for (int i=0; i < N; ++i)
    {
        a[i] = i;
        b[i] = i;
        c[i] = i;
        d[i] = i;
    }

    Timer timer;
    long int iterations = 100;

    timer.start();
    for (long i=0; i < iterations; ++i)
        unfused(N);
    timer.stop();

    report("Unfused", timer, N, iterations);

    timer.start();
    for (long i=0; i < iterations; ++i)
        fused(N);
    timer.stop();

    report("Fused", timer, N, iterations);

    timer.start();
    for (long i=0; i < iterations; ++i)
        chunky(N);
    timer.stop();

    report("Chunky", timer, N, iterations);

    return 0;
}

void report(const char* name, Timer& timer, int N,
    long int iterations)
{
    float flops = float(N) * iterations * 2;
    float Mflops = flops / timer.elapsedSeconds() / 1e+6;
    cout << setw(20) << name << " " << Mflops << " Mflops/s" << endl;
}

void __sink() { }

void unfused(int N)
{
    for (int i=0; i < N; ++i)
        e[i] = a[i] * b[i] + c[i] * d[i];

    __sink();

    for (int i=0; i < N; ++i)
        f[i] = c[i] * b[i] + a[i] * d[i];
}

void fused(int N)
{
    for (int i=0; i < N; ++i)
    {
        e[i] = a[i] * b[i] + c[i] * d[i];
        f[i] = c[i] * b[i] + a[i] * d[i];
    }
}


// This "chunky" routine is a simulated implementation of
// expression templates with tiling across multiple statements
// (the "chunky fusion" approach).  This code would be
// generated by:
//
//     fuse {
//         E = A*B + C*D;
//         F = C*B + A*D;
//     }

void chunky(int N)
{
    fuse {

        {   // Code generated by E = A*B + C*D;
            int lbound = _chunk * _chunk_size;
            int uboundp1 = lbound + _chunk_size;

            if (uboundp1 > N)
            {
                _done_chunks = true;
                uboundp1 = N;
            }

            for (int i=lbound; i < uboundp1; ++i)
                e[i] = a[i] * b[i] + c[i] * d[i];
        }

        __sink();

        {   // Code generated by F = C*B + A*D;
            int lbound = _chunk * _chunk_size;
            int uboundp1 = lbound + _chunk_size; 

            if (uboundp1 > N)
            {
                _done_chunks = true;
                uboundp1 = N;
            }

            for (int i=lbound; i < uboundp1; ++i)
                f[i] = c[i] * b[i] + a[i] * d[i];
        }
    }
}

